Import Data Set

 # Load the OSHA severe-injury CSV; skip columns we never use, parse the
 # event date, and force ID to character so leading zeros survive.
 severeinjury <- read_csv(
   "~/Downloads/severeinjury.csv",
   col_types = cols(
     Address2 = col_skip(),
     EventDate = col_date(format = "%m/%d/%Y"),
     ID = col_character(),
     Inspection = col_skip()
   )
 )
# Rename the two awkward column names we reference repeatedly below.
names(severeinjury)[names(severeinjury) == "Primary NAICS"] <- "naics"
names(severeinjury)[names(severeinjury) == "Part of Body Title"] <- "bodyPart"
# Pull month and year out of the event date for later time grouping.
severeinjury$month <- month(severeinjury$EventDate)
severeinjury$year <- year(severeinjury$EventDate)

Cleaning

Nulls

# Count the missing values in every column, then melt to long form for
# display. Fix: summarise_all(funs(...)) is deprecated in modern dplyr;
# across() with a lambda is the supported replacement and gives the same
# one-row result.
Nulls <- severeinjury %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
Nulls <- melt(Nulls)
## No id variables; using all as measure variables
kable(Nulls)
variable value
ID 0
UPA 0
EventDate 0
Employer 0
Address1 11
City 11
State 0
Zip 13
Latitude 4
Longitude 4
naics 3
Hospitalized 0
Amputation 2
Final Narrative 0
Nature 0
NatureTitle 0
Part of Body 0
bodyPart 0
Event 0
EventTitle 0
Source 0
SourceTitle 0
Secondary Source 23509
Secondary Source Title 23509
month 0
year 0

There are some columns that are almost entirely null. For right now, I am leaving everything because I do not know what we might want to do in the future. It would make sense for Secondary Source to be mainly null, because sometimes there is no secondary source.

From this, we also see that there are three records that do not have a NAICS code. Since we are going to be breaking up the records into two different files, the service sector and the non-service sector, we will need to get rid of those.

# Remove the three records that have no NAICS code at all.
severeinjury <- drop_na(severeinjury, naics)

I want to check whether all the records have a full 6-digit NAICS code, so I can decide which ones to remove.

# Record how many characters each NAICS code has so we can audit whether
# every code is the full 6 digits.
severeWithdigits <- mutate(severeinjury, NumberOfNaics = nchar(naics))

max(severeWithdigits$NumberOfNaics)
## [1] 6
min(severeWithdigits$NumberOfNaics)
## [1] 2

The max number of digits is 6 which is good, but the minimum is 2. Let’s check how many records have less than 6.

# How many records fall short of the full 6 digits?
length(which(severeWithdigits$NumberOfNaics <= 5))
## [1] 131

There are 131 records with NAICS codes that either need to be handled separately or just thrown out; for now I am going to remove them by putting them in a different data set.

# Park the 131 short-code records in their own data set rather than
# deleting them outright, in case we want them later.
NotRealNaisc <- filter(severeWithdigits, NumberOfNaics <= 5)

# Keep only records with a full 6-digit NAICS code for the main analysis.
Injury <- filter(severeWithdigits, NumberOfNaics == 6)

Separate the data sets into Service and NonService

# NAICS codes 22xxxx (utilities) are grouped with the service sector here.
Utilities <- Injury %>%
  filter(naics >= 220000, naics < 230000)

# Codes 40xxxx and above are treated as service industries.
Service <- Injury %>%
  filter(naics >= 400000)

Service <- rbind(Service, Utilities)

# Codes from 23xxxx up to (but not including) 40xxxx ...
NonServiceTop <- Injury %>%
  filter(naics >= 230000, naics < 400000)

# ... plus codes below 22xxxx make up the non-service sector.
NonServiceBottom <- Injury %>%
  filter(naics < 220000)

NonService <- rbind(NonServiceTop, NonServiceBottom)

Percentage Hospitalized Injuries in Service and NonService Job Sectors

# Fraction of reports in each sector that involved hospitalization.
with(NonService, sum(Hospitalized) / length(Hospitalized))
## [1] 0.7574279
with(Service, sum(Hospitalized) / length(Hospitalized))
## [1] 0.8735039

Roughly 87% of service-sector reports involved hospitalization, compared to roughly 76% of non-service reports, so hospitalization is somewhat more common among the service-sector records.

Top 10 Nature Titles for Service

# Most common injury natures in the service sector (>190 occurrences).
# Fix: the sector label was mistakenly "NonService" even though this is
# the Service subset; corrected to "Service".
Top10Service <- Service %>%
  group_by(NatureTitle) %>%
  summarize(countSer = n()) %>%
  filter(countSer > 190) %>%
  mutate(sector = "Service")

ggplot(Top10Service, aes(reorder(NatureTitle, countSer), countSer)) +
  geom_bar(stat = "identity") +
  coord_flip()

Top 10 Nature Titles for NonService

# Most common injury natures in the non-service sector (>250 occurrences).
Top10NonService <- NonService %>%
  count(NatureTitle, name = "countNon") %>%
  filter(countNon > 250) %>%
  mutate(sector = "NonService")

ggplot(Top10NonService, aes(reorder(NatureTitle, countNon), countNon)) +
  geom_bar(stat = "identity") +
  coord_flip()

The top 4 of the categories remain the same for both sectors. It is after that that we see the differences.

Body Parts

# Body parts injured more than 500 times in the service sector.
Top10BodyPartService <- Service %>%
  count(bodyPart, name = "count") %>%
  filter(count > 500)

ggplot(Top10BodyPartService, aes(reorder(bodyPart, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Body parts injured more than 500 times in the non-service sector.
Top10BodyPartNonService <- NonService %>%
  count(bodyPart, name = "count") %>%
  filter(count > 500)

ggplot(Top10BodyPartNonService, aes(reorder(bodyPart, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()

Time Data

Lets look and see if there are any points from the last 3 years that have extremely high points of data.

# Injury counts per month/year for each sector; FullDate is a
# "month/year" label for display.
ServiceDate <- Service %>%
  group_by(month, year) %>%
  tally(name = "numInjuries") %>%
  mutate(FullDate = paste(month, year, sep = "/"))

NonServiceDate <- NonService %>%
  group_by(month, year) %>%
  tally(name = "numInjuries") %>%
  mutate(FullDate = paste(month, year, sep = "/"))

# Overlay the two sectors' monthly counts (blue = service, red = non-service),
# faceted one panel per year.
p <- ggplot() +
  geom_line(data = ServiceDate,
            aes(group = 1, x = month, y = numInjuries), color = "blue") +
  geom_line(data = NonServiceDate,
            aes(group = 1, x = month, y = numInjuries), color = "red") +
  facet_wrap(~ year, ncol = 4)
ggplotly(p)

Something interesting about this is that they follow a very similar pattern. That really isn’t something I would expect. I would have thought that they would be a bit more random; however, the trends seem to follow each other, which makes me think there is some outside force driving more reports in certain months. We seem to see spikes in September and October.

What happened in August of 2016?

# Injury natures with more than 10 service-sector reports in August 2016.
Top10ServiceAug2016 <- Service %>%
  filter(month == 8, year == 2016) %>%
  count(NatureTitle, name = "countSer") %>%
  filter(countSer > 10)

ggplot(Top10ServiceAug2016, aes(reorder(NatureTitle, countSer), countSer)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Injury natures with more than 15 non-service reports in August 2016.
Top10NonServiceAug2016 <- NonService %>%
  filter(month == 8, year == 2016) %>%
  count(NatureTitle, name = "count") %>%
  filter(count > 15)

ggplot(Top10NonServiceAug2016, aes(reorder(NatureTitle, count), count)) +
  geom_bar(stat = "identity") +
  coord_flip()

We see a lot more heat and light injuries. I wonder if the temperature was extremely high in August of that year?

# What share of all records fall in August 2016?
InjuryAug <- filter(Injury, month == 8, year == 2016)
nrow(InjuryAug) / nrow(Injury)
## [1] 0.03325728

What is the top injury per state?

Do we have an even distribution of records across states?

# Count records per state in each sector.
ServiceStates <- Service %>%
  group_by(State) %>%
  summarise(ServiceRecords = n())
NonServiceStates <- NonService %>%
  group_by(State) %>%
  summarise(NonServiceRecords = n())

# Rank states by record count (rank 1 = most records). Assigning
# 1..n into the positions sorted by descending count gives each state
# its rank.
ServiceStates$ServiceRank <- NA
ServiceStates$ServiceRank[order(-ServiceStates$ServiceRecords)] <-
  seq_len(nrow(ServiceStates))

NonServiceStates$NonServiceRank <- NA
NonServiceStates$NonServiceRank[order(-NonServiceStates$NonServiceRecords)] <-
  seq_len(nrow(NonServiceStates))

# Join the two summaries ONCE, after the ranks exist. Fix: the original
# joined twice under two near-identical names (StatesRecords before the
# ranks were added, StateRecords after); only this one was ever used.
StateRecords <- left_join(ServiceStates, NonServiceStates)
## Joining, by = "State"

Top 20 states for service and non service are different. This is semi interesting.

A rank of 1 means that that state had the most records.

# Keep just the state and the two rank columns for the display table.
StateRecords <- select(StateRecords, State, ServiceRank, NonServiceRank)

kable(StateRecords)
State ServiceRank NonServiceRank
ALABAMA 14 9
ALASKA 49 43
AMERICAN SAMOA 54 44
ARIZONA 35 37
ARKANSAS 15 12
CALIFORNIA 21 32
COLORADO 9 10
CONNECTICUT 20 23
DELAWARE 32 26
DISTRICT OF COLUMBIA 29 31
FLORIDA 2 2
GEORGIA 7 6
GUAM 50 42
HAWAII 44 41
IDAHO 24 21
ILLINOIS 5 5
INDIANA 45 NA
IOWA 47 NA
KANSAS 16 16
KENTUCKY 46 34
LOUISIANA 13 13
MAINE 22 25
MARYLAND 36 35
MASSACHUSETTS 11 17
MICHIGAN 37 51
MINNESOTA 43 52
MISSISSIPPI 19 15
MISSOURI 10 11
MONTANA 27 28
NEBRASKA 18 19
NEVADA 51 NA
NEW HAMPSHIRE 26 27
NEW JERSEY 8 18
NEW MEXICO 39 46
NEW YORK 6 8
NORTH CAROLINA 38 36
NORTH DAKOTA 25 20
NORTHERN MARIANA ISLANDS 53 48
OHIO 4 3
OKLAHOMA 17 14
OREGON 40 38
PENNSYLVANIA 3 4
PUERTO RICO 55 49
RHODE ISLAND 31 30
SOUTH CAROLINA 42 39
SOUTH DAKOTA 28 24
TENNESSEE 34 40
TEXAS 1 1
UTAH 41 50
VERMONT 52 NA
VIRGINIA 30 29
WASHINGTON 33 33
WEST VIRGINIA 23 22
WISCONSIN 12 7
WYOMING 48 45